import numpy as np 
import pandas as pd 
import bokeh
from autoviz.AutoViz_Class import AutoViz_Class
import matplotlib.pyplot as plt
%matplotlib inline
from pycaret.classification import *
import warnings
# Silence all warnings for the rest of the session. The catch-all filter on
# the next line already matches every Warning subclass (FutureWarning
# included), so the narrower FutureWarning filter after it is redundant but
# harmless.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the star-classification CSV into a DataFrame and preview the first
# five rows (the trailing expression is what the notebook cell displays).
stellar = pd.read_csv(filepath_or_buffer="star_classification.csv")
stellar.head(n=5)
obj_ID alpha delta u g r i z run_ID rerun_ID cam_col field_ID spec_obj_ID class redshift plate MJD fiber_ID
0 1.237661e+18 135.689107 32.494632 23.87882 22.27530 20.39501 19.16573 18.79371 3606 301 2 79 6.543777e+18 GALAXY 0.634794 5812 56354 171
1 1.237665e+18 144.826101 31.274185 24.77759 22.83188 22.58444 21.16812 21.61427 4518 301 5 119 1.176014e+19 GALAXY 0.779136 10445 58158 427
2 1.237661e+18 142.188790 35.582444 25.26307 22.66389 20.60976 19.34857 18.94827 3606 301 2 120 5.152200e+18 GALAXY 0.644195 4576 55592 299
3 1.237663e+18 338.741038 -0.402828 22.13682 23.77656 21.61162 20.50454 19.25010 4192 301 3 214 1.030107e+19 GALAXY 0.932346 9149 58039 775
4 1.237680e+18 345.282593 21.183866 19.43718 17.58028 16.49747 15.97711 15.54461 8102 301 3 137 6.891865e+18 GALAXY 0.116123 6121 56187 842

# Run AutoViz on the CSV path (not on the already-loaded `stellar` frame),
# with `class` as the dependent variable and Bokeh as the chart format.
# An empty string is passed for `dfte`, so the data comes from `filename`.
AV = AutoViz_Class()
target = "class"
sep = ","
filename = "star_classification.csv"
dft = AV.AutoViz(
    filename,
    sep=sep,
    depVar=target,
    dfte="",
    header=0,
    verbose=0,
    lowess=False,
    chart_format="bokeh",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
)

Shape of your Data Set loaded: (100000, 18)
############## C L A S S I F Y I N G  V A R I A B L E S  ####################
Classifying variables in data set...
    17 Predictors classified...
        1 variables removed since they were ID or low-information variables

################ Multi_Classification VISUALIZATION Started #####################
Time to run AutoViz (in seconds) = 10

# Initialise the PyCaret classification experiment on the full table with
# `class` as the label. Without an explicit seed PyCaret draws a random
# session_id on every run, so the train/test split, CV folds and all reported
# metrics would drift between executions. The seed is pinned to 2110, the
# value the recorded run drew (see the setup summary), so re-running the
# notebook uses the same seed as the captured output.
clf = setup(stellar, target='class', session_id=2110)

  Description Value
0 session_id 2110
1 Target class
2 Target Type Multiclass
3 Label Encoded GALAXY: 0, QSO: 1, STAR: 2
4 Original Data (100000, 18)
5 Missing Values 0
6 Numeric Features 15
7 Categorical Features 2
8 Ordinal Features 0
9 High Cardinality Features 0
10 High Cardinality Method None
11 Transformed Train Set (69999, 17)
12 Transformed Test Set (30001, 17)
13 Shuffle Train-Test True
14 Stratify Train-Test False
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU 0
19 Log Experiment 0
20 Experiment Name clf-default-name
21 USI 7bd1
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling least_frequent
29 Normalize 0
30 Normalize Method None
31 Transformation 0
32 Transformation Method None
33 PCA 0
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance 0
37 Combine Rare Levels 0
38 Rare Level Threshold None
39 Numeric Binning 0
40 Remove Outliers 0
41 Outliers Threshold None
42 Remove Perfect Collinearity 1
43 Remove Multicollinearity 0
44 Multicollinearity Threshold None
45 Remove Perfect Collinearity 1
46 Columns Removed Due to Multicollinearity []
47 Clustering 0
48 Clustering Iteration None
49 Polynomial Features 0
50 Polynomial Degree None
51 Trignometry Features 0
52 Polynomial Threshold None
53 Group Features 0
54 Feature Selection 0
55 Feature Selection Method classic
56 Features Selection Threshold None
57 Feature Interaction 0
58 Feature Ratio 0
59 Interaction Threshold None
60 Fix Imbalance 0
61 Fix Imbalance Method SMOTE
# Cross-validate the available classifiers with default arguments and show
# the ranked leaderboard. The returned best model is only echoed as the cell
# output; it is not bound to a name here.
compare_models()
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
catboost CatBoost Classifier 0.9780 0.9949 0.9708 0.9779 0.9779 0.9608 0.9609 9.7320
xgboost Extreme Gradient Boosting 0.9778 0.9945 0.9702 0.9777 0.9777 0.9605 0.9606 11.5800
lightgbm Light Gradient Boosting Machine 0.9774 0.9947 0.9700 0.9773 0.9773 0.9598 0.9599 1.7750
rf Random Forest Classifier 0.9759 0.9940 0.9684 0.9758 0.9757 0.9570 0.9571 3.4440
gbc Gradient Boosting Classifier 0.9754 0.9943 0.9658 0.9753 0.9752 0.9561 0.9563 16.9700
et Extra Trees Classifier 0.9692 0.9912 0.9599 0.9692 0.9689 0.9450 0.9452 1.3390
dt Decision Tree Classifier 0.9631 0.9661 0.9580 0.9632 0.9631 0.9346 0.9346 0.3170
lda Linear Discriminant Analysis 0.8239 0.9172 0.7348 0.8432 0.8082 0.6535 0.6840 0.0870
ridge Ridge Classifier 0.7986 0.0000 0.7028 0.8125 0.7698 0.5997 0.6374 0.0340
nb Naive Bayes 0.7049 0.8602 0.6389 0.6949 0.6694 0.4464 0.4656 0.0320
knn K Neighbors Classifier 0.7045 0.7841 0.5834 0.6960 0.6878 0.4288 0.4413 0.2380
qda Quadratic Discriminant Analysis 0.6456 0.7649 0.5845 0.7237 0.6111 0.3761 0.4298 0.0520
ada Ada Boost Classifier 0.6257 0.7457 0.5851 0.6891 0.6174 0.3470 0.3732 1.3290
lr Logistic Regression 0.6217 0.6551 0.4052 0.5719 0.5174 0.1336 0.2050 5.1160
dummy Dummy Classifier 0.5948 0.5000 0.3333 0.3537 0.4436 0.0000 0.0000 0.0300
svm SVM - Linear Kernel 0.4705 0.0000 0.3512 0.3432 0.3482 0.0171 0.0245 2.3510
<catboost.core.CatBoostClassifier at 0x7fcd1a6bc400>
# Train a CatBoost classifier and display its per-fold cross-validation
# metrics together with their mean and standard deviation.
# NOTE(review): the name `catboost` would shadow the `catboost` package if
# that package were ever imported in this session.
catboost = create_model('catboost')
  Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9769 0.9946 0.9678 0.9768 0.9767 0.9586 0.9588
1 0.9734 0.9929 0.9621 0.9734 0.9731 0.9524 0.9527
2 0.9771 0.9950 0.9690 0.9770 0.9770 0.9591 0.9593
3 0.9783 0.9954 0.9682 0.9783 0.9781 0.9611 0.9613
4 0.9771 0.9928 0.9680 0.9770 0.9769 0.9591 0.9593
5 0.9754 0.9939 0.9660 0.9753 0.9752 0.9560 0.9562
6 0.9751 0.9928 0.9646 0.9751 0.9749 0.9555 0.9557
7 0.9769 0.9938 0.9681 0.9768 0.9767 0.9586 0.9588
8 0.9700 0.9929 0.9589 0.9698 0.9697 0.9463 0.9465
9 0.9744 0.9929 0.9667 0.9743 0.9743 0.9543 0.9544
Mean 0.9755 0.9937 0.9659 0.9754 0.9753 0.9561 0.9563
SD 0.0023 0.0010 0.0030 0.0023 0.0023 0.0041 0.0041
# Draw the diagnostic plots for the CatBoost model one after another,
# opening a new 10x8 Matplotlib figure before each plot_model call.
for plot_kind in ("auc", "confusion_matrix", "class_report", "boundary", "error"):
    plt.figure(figsize=(10, 8))
    plot_model(catboost, plot=plot_kind)
# Score data with the trained CatBoost model. No `data` argument is given,
# so per the PyCaret docs this evaluates the hold-out split created by
# setup(). The returned frame carries the transformed features plus the true
# `class`, the predicted `Label` and its `Score`.
predictions = predict_model(catboost)
# Preview the first rows of the scored frame as the cell output.
predictions.head()
  Model Accuracy AUC Recall Prec. F1 Kappa MCC
0 CatBoost Classifier 0.9738 0.9927 0.9639 0.9737 0.9735 0.9534 0.9536
alpha delta r i run_ID field_ID spec_obj_ID redshift MJD fiber_ID rerun_ID_301 cam_col_1 cam_col_2 cam_col_3 cam_col_4 cam_col_5 cam_col_6 class Label Score
0 150.383087 46.569584 19.551550 18.75569 2830.0 267.0 7.501961e+18 0.463640 56338.0 325.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 GALAXY GALAXY 0.9894
1 15.122005 -9.587355 18.385139 17.79730 1740.0 122.0 7.409813e+17 0.326329 52146.0 506.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 GALAXY GALAXY 0.9492
2 211.023254 46.052670 20.447439 19.36515 2964.0 440.0 7.599883e+18 0.571471 56367.0 211.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 GALAXY GALAXY 0.9925
3 166.308929 28.261070 16.591160 16.24835 5061.0 340.0 2.489408e+18 0.065899 53786.0 158.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 GALAXY GALAXY 0.9959
4 134.809143 32.857159 17.457750 16.98632 3560.0 185.0 1.431164e+18 0.147677 52974.0 530.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 GALAXY GALAXY 0.9926
# Finalize the CatBoost model for deployment. Per the PyCaret docs,
# finalize_model refits the estimator on the entire dataset, hold-out included.
final_catboost = finalize_model(catboost)
# The original name wrongly suggested an XGBoost model. It is kept as an
# alias to the same object so any later reference to `final_xgboost` still
# works.
final_xgboost = final_catboost